# This code replicates the results for 
# Study 2 of Inclusionary & Exclusionary Preferences 

# Version: September 18, 2025 
# Author: Marika Landau-Wells (mlw@berkeley.edu)

# All procedures follow pre-reg on OSF (https://osf.io/m83n7/)
# For ease of interpretation, hypothesis labels follow the article's format instead of the pre-registration
# H4a = pre-reg H2, H4b = pre-reg H3, HS1 = pre-reg H1

# Note: the datasets used in Study 2 come from other authors
# For access to the NASIS2018 survey, contact Nebraska's Bureau of Sociological Research (BOSR) 
# bosr@unl.edu, 402-472-3672
# The Human Values Behind Arguments Dataset can be downloaded from:
# https://github.com/webis-de/ACL-22?tab=readme-ov-file

# All code runs in R but re-running the embeddings requires a Python installation
# and the sentence-transformers library, which can be installed via conda or pip
# To run the Python code from R, you can use the reticulate package (see Section 9)

# Originally created with R version 4.4.3 (runs as of April 9, 2025)
# Running groundhog will ensure the correct versions of all packages are loaded
# for replication purposes

library(groundhog)

pkgs = c("dplyr", "stringi", "ggplot2", "tokenizers")
groundhog.library(pkgs, "2025-04-09")

# To load independently, use these versions:
#library(dplyr) #v 1.1.4
#library(stringi) #v 1.8.7
#library(ggplot2) #v 3.5.1
#library(tokenizers) #v 0.3.0

# Set as needed 
#setwd()


# Contents:
# 1. Import NASIS 2018 Embeddings  
# 2. Import HVBA raw data & Embeddings
# 3. Calculate cosine Similarities
# 4. Main Text: Test of H4a
# 5. Main Text: Test of H4b
# 6. Main Text: Exploratory Analyses
# 7. Appendix F: Test of HS1 
# 8. Generate Figure 4
# 9. OPTIONAL: Regenerate embeddings for HVBA dataset

# Clear the environment
gc()
rm(list = ls())

###########################
# 1. Import NASIS Embeddings ####
d.nasis.embed <- read.csv("NASIS_Embeddings_n602.csv", stringsAsFactors = F)
# Correct N (cleaned) is 602
# Note that IDs were generated before cleaning and correspond to rows in the original data
# Embeddings generated from Question 38 in the original dataset (LGBT8)

#table(d.nasis.embed$Group)
# Endorse  Oppose 
# 291     311

###########################

###########################
# 2. Import HVBA raw data & Embeddings ####

## Import HVBA Full Datasets ####
# Kiesel et al. (2022), "Identifying the Human Values behind Arguments."
# Note: the arguments come from respondents in multiple regions
# We filter to US only for this task
# Two levels Schwartz model are used in pre-registered filtering (requires 2 versions of data)

# Download full dataset first (webis-argvalues-22.zip) from:
# https://zenodo.org/records/6855004


# Import raw argument text
d.arg.txt <- readr::read_tsv("webis-argvalues-22/arguments.tsv")
# Level 2 labels (e.g., benevolence: caring)
d.arg.l2 <- readr::read_tsv("webis-argvalues-22/labels-level2.tsv")
# Level 3 labels (e.g., self-transcendence)
d.arg.l3 <- readr::read_tsv("webis-argvalues-22/labels-level3.tsv")

# Re-name variables
d.arg.l2 <- d.arg.l2 %>%
  rename("SelfDirection_thought" = "Self-direction: thought",
         "SelfDirection_action" = "Self-direction: action",
         "Security_personal" = "Security: personal",
         "Security_societal" = "Security: societal",
         "Power_dominance" = "Power: dominance",
         "Power_resources" = "Power: resources",
         "Conformity_rules" = "Conformity: rules",
         "Conformity_inter" = "Conformity: interpersonal",
         "Benevolence_caring" = "Benevolence: caring",
         "Benevolence_depend" = "Benevolence: dependability",
         "Universalism_concern" = "Universalism: concern",
         "Universalism_object" = "Universalism: objectivity",
         "Universalism_nature" = "Universalism: nature",
         "Universalism_tolerance" = "Universalism: tolerance")

d.arg.l3 <- d.arg.l3 %>%
  rename("SelfEnhancement" = "Self-enhancement",
         "SelfTranscendence" = "Self-transcendence",
         "Openness_to_change" = "Openness to change")

# Merge by argument id to filter to usable data
d.arg <- left_join(d.arg.txt, d.arg.l2)
d.arg <- left_join(d.arg, d.arg.l3)

## Reduce HVBA Dataset to US Responses ####

# Filter to US only (N = 5020)
d.arg.usa <- filter(d.arg, Part == "usa")

# Tidy the environment
rm(d.arg, d.arg.l2, d.arg.l3, d.arg.txt)

# Check overlap on L3 (Schwartz's Higher order values)
d.arg.usa$dims <- rowSums(d.arg.usa[,c("SelfEnhancement",
                                       "SelfTranscendence",
                                       "Conservation",
                                       "Openness_to_change")] )

table(d.arg.usa$dims)
#1    2    3    4 
#741 2302 1565  412 

# Dataset of items which code as only 1 of the L3s 
d.arg.dim1 <- filter(d.arg.usa, dims == 1)
# N = 741

# Dataset of items coded as 2 L3s, minus contradictions 
# Contradictions are defined as L3 dimensions treated as oppositional by Schwartz
# Self-enhancement vs. self-transcendence
# Conservation vs. Openness to Change

# filter to dims = 2
d.arg.d2 <- filter(d.arg.usa, dims == 2)
# N = 2302

# drop inconsistent combinations
d.arg.d2 <- filter(d.arg.d2, !(SelfTranscendence == 1 & 
                                 SelfEnhancement == 1))
# N = 2091
d.arg.d2 <- filter(d.arg.d2, !(Openness_to_change == 1 & 
                                 Conservation == 1))
# N = 1914
# Note 25 joint codings are for Hedonism (which is theorized as cross-cat)
# table(d.arg.d2$Hedonism, d.arg.d2$Openness_to_change)

## Clean HVBA Text ####

# Dim1 texts
txt.argd1 <- stri_enc_toutf8(d.arg.dim1$Premise, 
                             is_unknown_8bit = T)
# Use paragraphs to tokenize in case there are multiple sentences
txt.argd1 <- tokenize_paragraphs(txt.argd1) #output is a list

# Unlist the paragraphs and save each within a character vector
content.vec <- c()
for (j in 1:length(txt.argd1)){
  content.vec[j] <- txt.argd1[[j]]
}

# Create dataframe clean text field
d.argd1.df <- data.frame(d.arg.dim1)
# Rename index to match other datasets
d.argd1.df <- d.argd1.df %>%
  rename("item_id" = "Argument.ID")
# Drop some useless cols
d.argd1.df <- select(d.argd1.df,
                     -Part, -Usage, -Premise)
# Add the new text
d.argd1.df$premise <- as.character(content.vec)

# Dim2 arguements
txt.argd2 <- stri_enc_toutf8(d.arg.d2$Premise, 
                             is_unknown_8bit = T)
# Use paragraphs to tokenize in case there are multiple sentences
txt.argd2 <- tokenize_paragraphs(txt.argd2) #output is a list

# Unlist the paragraphs and save each within a character vector
content.vec <- c()
for (j in 1:length(txt.argd2)){
  content.vec[j] <- txt.argd2[[j]]
}

# Create dataframe with a clean text field
d.argd2.df <- data.frame(d.arg.d2)
# Rename index to match other datasets
d.argd2.df <- d.argd2.df %>%
  rename("item_id" = "Argument.ID")
# Drop some useless cols
d.argd2.df <- select(d.argd2.df,
                     -Part, -Usage, -Premise)
# Add the new text
d.argd2.df$premise <- as.character(content.vec)

# Write out .csvs if re-running embeddings
#write.csv(d.argd1.df, file = "Args_Unique_df.csv", row.names = F) #For running the embeddings model 
#write.csv(d.argd2.df, file = "Args_2Dims_df.csv", row.names = F) #For running the embeddings model 

# Clean the environment
rm(d.arg.dim1, d.arg.d2, d.arg.usa)


## Import HVBA Embeddings ####

# Uniquely labeled premises (N=741)
d.argd1.embeddings <- read.csv('./Embeddings/Args_Unique_df_embeddings.csv', header = F)
#head(d.argd1.embeddings)[,1:5]

# Combine the embeddings and the original data about the sentences
d.argd1.embed <- cbind(d.argd1.df, d.argd1.embeddings)
#colnames(d.argd1.embed)[1:36]


# Jointly labeled premises (N=1941)
d.arg_2dims.embeddings <- read.csv('./Embeddings/Args_2Dims_df_embeddings.csv', header = F)
#head(d.arg_2dims.embeddings)[,1:5]

# Combine the embeddings and the original data from HVBA
d.argd2.embed <- cbind(d.argd2.df, d.arg_2dims.embeddings)
#colnames(d.argd2.embed)[1:36]


## Add Other-Regarding Values variable ####

# Add column for OtherRegard based on pre-registered criteria
# For Unique: SelfDirection_Action = 1 or Universalism_concern = 1
univ_c.bin <- ifelse(d.argd1.embed$Universalism_concern == 1 & 
                       d.argd1.embed$Universalism_tolerance == 0 & 
                       d.argd1.embed$Universalism_nature == 0 & 
                       d.argd1.embed$Universalism_object == 0, 1, 0)
sd_a.bin <- ifelse(d.argd1.embed$SelfDirection_action == 1 &
                     d.argd1.embed$SelfDirection_thought == 0 &
                     d.argd1.embed$Stimulation == 0, 1, 0)
dim1.bin <- c(univ_c.bin + sd_a.bin)
sum(dim1.bin) #109
length(dim1.bin) #741 (correct)

# Add to embedding df
d.argd1.embed$OtherRegard <- dim1.bin


# For 2Dim: SelfDirection_Action = 1 and Universalism_concern = 1
ucsd.bin <- ifelse(d.argd2.embed$Universalism_concern == 1 & 
                     d.argd2.embed$SelfDirection_action == 1, 1, 0)
sum(ucsd.bin) #72 
length(ucsd.bin) #1914 (correct)

# Add to embedding df
d.argd2.embed$OtherRegard <- ucsd.bin

# Combine the 2 sets of embeddings
d.arg.embed <- rbind(d.argd1.embed, d.argd2.embed)
sum(d.arg.embed$OtherRegard) #181

# Generate 2 vectors of arg IDs to use for selecting cols
vec.ORV <- select(d.arg.embed, item_id, OtherRegard) %>%
  filter(OtherRegard == 1) %>%
  select(-OtherRegard) %>% c() %>% unlist()
vec.Other <- select(d.arg.embed, item_id, OtherRegard) %>%
  filter(OtherRegard == 0) %>%
  select(-OtherRegard) %>% c() %>% unlist()


###################################################

###################################################
# 3. Cosine Similarities ####

## Helper Functions ####
# Helper function to calculate the similarity matrix for NASIS
# assumes 'ID' column is used consistently to label documents
# also assumes no columns start with "V" except embeddings
embed2sim <- function(x){
  # x is a dataframe of embeddings and metadata
  # Previous functions return a grouped tibble - just make sure this is a dataframe
  x <- data.frame(x)
  # save the embeddings as df (note this is vars V1:V768)
  embed.x <- dplyr::select(x, starts_with("V"))
  # save the meta-data from the embedding file, which is every other col
  meta.x <- dplyr::select(x, !starts_with("V"))
  # Turn the embeddings into a matrix
  embed.mat <- data.matrix(embed.x)
  # Calculate cosine similarity within the document set (i.e., embed.mat x 2)
  sim.x <- text2vec::sim2(embed.mat, embed.mat, method = "cosine")
  # Assign column names based on the level of the embeddings
  colnames(sim.x) <- meta.x[,'ID']
  sim.x.df <- data.frame(sim.x)
  sim.x.df <- cbind(meta.x, sim.x.df)
  return(sim.x.df)
}


# Helper function to generate within- and between- avgs
# All original data:
nasis.true.sim <- function(x) {
  # x = the similarity dataframe
  # Generate the correct rows for each comparison; 
  x.fav <- filter(x, Group == "Endorse") 
  x.opp <- filter(x, Group == "Oppose") 
  # create vectors of ids to select
  id.fav <- as.character(x.fav$ID)
  id.opp <- as.character(x.opp$ID)
  # create symmetric sim frames
  fav_fav <- select(x.fav, ID, all_of(id.fav)) %>%
    mutate(n_txt = ncol(.) - 2,
           #this removes the item_id column and the reference text
           total = rowSums(across(where(is.numeric))) - n_txt - 1,
           #this removes the value of 1 within the row where the document matches itself        
           mean_sim = total/n_txt) %>%
    summarise(mean_txt_sim = mean(mean_sim))
  
  fav_opp <- select(x.fav, ID, all_of(id.opp)) %>%
    mutate(n_txt = ncol(.) - 1,
           #this removes the ref_text column
           total = rowSums(across(where(is.numeric))) - n_txt,
           mean_sim = total/n_txt) %>%
    summarise(mean_txt_sim = mean(mean_sim))
  
  opp_opp <- select(x.opp, ID, all_of(id.opp)) %>%
    mutate(n_txt = ncol(.) - 2,
           #this removes the ref_text column
           total = rowSums(across(where(is.numeric))) - n_txt - 1,
           mean_sim = total/n_txt) %>%
    summarise(mean_txt_sim = mean(mean_sim))
  
  summary.df <- data.frame(
    X = c("Endorse", "Endorse", "Oppose"), 
    Y = c("Endorse", "Oppose","Oppose"),
    Value = c(fav_fav$mean_txt_sim, 
              fav_opp$mean_txt_sim,
              opp_opp$mean_txt_sim)
  )
  return(summary.df)
}

# Helper function takes an Nasis df and a values df
# enter nasis as x (will be rows), vals as y (will be cols)
embed2sim_ref <- function(x, y){
  # x is a dataframe of NASIS embeddings and metadata
  # y is a dataframe of Values embeddings and metadata
  # Previous functions return a grouped tibble - just make sure this is a dataframe
  x <- data.frame(x)
  y <- data.frame(y)
  # save the embeddings as df (note this is vars V1:V768)
  embed.x <- dplyr::select(x, starts_with("V"))
  embed.y <- dplyr::select(y, starts_with("V"))
  # save the meta-data from the embedding file, which is every other col
  meta.x <- dplyr::select(x, !starts_with("V"))
  meta.y <- dplyr::select(y, !starts_with("V"))
  # Turn the embeddings into a matrix
  embed.mat.x <- data.matrix(embed.x)
  embed.mat.y <- data.matrix(embed.y)
  # Calculate cosine similarity within the document set 
  sim.xy <- text2vec::sim2(embed.mat.x, embed.mat.y, method = "cosine")
  # Assign column names based on the level of the embeddings
  colnames(sim.xy) <- meta.y[,'item_id']
  sim.xy.df <- data.frame(sim.xy)
  sim.xy.df <- cbind(meta.x, sim.xy.df)
  return(sim.xy.df)
}



## Between NASIS & HVBA Similarity (H4a, H4b) ####
# Calculate similarity for 602 responses vs. 2655 premises
nasis_arg_sim <- embed2sim_ref(d.nasis.embed, 
                                d.arg.embed)
#nasis_arg_sim[1:6,14:26]

## Within NASIS Similarities (HS1) ####

d.nasis.sims <- embed2sim(d.nasis.embed)
# Examine:
#d.nasis.sims[1:6,14:26]

nasis.polpos <- nasis.true.sim(d.nasis.sims)
nasis.polpos
#     X       Y     Value
#1 Endorse Endorse 0.1991707
#2 Endorse  Oppose 0.1896399
#3  Oppose  Oppose 0.2155813

wb_oe = nasis.polpos[3,3] - nasis.polpos[2,3]
wb_oe
wb_eo = nasis.polpos[1,3] - nasis.polpos[2,3]
wb_eo




###################################################

###################################################
# 4. Main Text: Test of H4a ####

## H4a: Opponents' justifications will be more similar to Other-regarding values justifications ####
# Filter by Group, then select the columns (OtherRegarding)
opp_OR_sim <- filter(nasis_arg_sim, 
                     Group == "Oppose") %>%
  select(-one_of(vec.Other))
opp_OtherVals_sim <- filter(nasis_arg_sim, 
                            Group == "Oppose") %>%
  select(-one_of(vec.ORV))
# Calculate Averages per text
n.opp.txts <- length(opp_OR_sim$ID)
opp_OR_sim_avg <- select(opp_OR_sim, all_of(vec.ORV)) %>%
  rowMeans()
opp_OtherVals_sim_avg <- select(opp_OtherVals_sim, 
                                all_of(vec.Other)) %>%
  rowMeans()

# Test for Normality
shapiro.test(opp_OR_sim_avg) #normal
shapiro.test(opp_OtherVals_sim_avg) #normal

# T-Test for H4a: OR_sim > OtherVals_sim 
# Paired bc each text is compared to both value sets
t.test(x = opp_OR_sim_avg,
       y = opp_OtherVals_sim_avg,
       alternative = "greater",
       paired = T)
#t = 23.139, df = 310, p-value < 2.2e-16

#summary(opp_OR_sim_avg)
#     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
#-0.005518  0.078077  0.107830  0.107107  0.135538  0.199930
#sd(opp_OR_sim_avg)
#0.03954073
#summary(opp_OtherVals_sim_avg)
#    Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
#-0.008158  0.063177  0.080230  0.081352  0.102530  0.160339
#sd(opp_OtherVals_sim_avg)
# 0.02887404

# Results: H4a confirmed

###################################################

###################################################
# 5. Main Text: Test H4b ####

## H4b: Opponents' justifications will be more similar to ORV justifications than Endorsers' ####
# Get Endorse OR sim
end_OR_sim <- filter(nasis_arg_sim, 
                     Group == "Endorse") %>%
  select(-one_of(vec.Other))

# Calculate Averages per text
n.end.txts <- length(end_OR_sim$ID)
end_OR_sim_avg <- select(end_OR_sim, all_of(vec.ORV)) %>%
  rowMeans()

# Test for Normality
shapiro.test(end_OR_sim_avg) #normal

# T-Test for H4b: Opp OR_sim > End OR_sim 
# Not paired
t.test(x = opp_OR_sim_avg,
       y = end_OR_sim_avg,
       alternative = "greater")
#1.7659, df = 599.92, p-value = 0.03896

# Descriptive stats
#summary(opp_OR_sim_avg)
# Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
#-0.005518  0.078077  0.107830  0.107107  0.135538  0.199930 
#sd(opp_OR_sim_avg)
#0.03954073

#summary(end_OR_sim_avg)
# Min.   1st Qu.    Median      Mean   3rd Qu.   Max. 
# 0.01212 0.07754 0.10431 0.10157 0.12476 0.20433 
#sd(end_OR_sim_avg)
#0.03742015

# Results: H4b confirmed 


###################################################

###################################################
# 6. Main Text: Exploratory Analyses ####
# Find Most Similar Value categories (not preregistered)

## Identify Arguments with unique value codes ####
# vector of value names
value_names <- c("SelfDirection_action", "SelfDirection_thought",
                 "Stimulation", "Hedonism", "Achievement",
                 "Power_dominance", "Power_resources", "Face",
                 "Security_personal", "Security_societal", "Tradition",
                 "Conformity_rules", "Conformity_inter" , "Humility",
                 "Benevolence_caring", "Benevolence_depend", 
                 "Universalism_concern", "Universalism_nature",
                 "Universalism_tolerance", "Universalism_object")

# Identify arguments with a single L2 value code (n = 313)
d.argd1_all <- select(d.argd1.embed, item_id, one_of(value_names))
d.argd1_all$d20 <- rowSums(d.argd1_all[,-1])

table(d.argd1_all$d20)
#1   2   3   4   5 
#313 313  94  19   2 

# Reduce to L2 unique arguments
d.argl2_1 <- filter(d.argd1_all, d20 == 1)

# Pull unique argument IDs
solo_arg_ids <- c(d.argl2_1$item_id)

## Identify maximally similar arguments ####
# Filter the similarity data by group
end_sim <- filter(nasis_arg_sim, 
                  Group == "Endorse") %>%
  select(ID, starts_with("A")) 

opp_sim <- filter(nasis_arg_sim, 
                  Group == "Oppose") %>%
  select(ID, starts_with("A")) 

end_sim <- select(end_sim, ID, one_of(solo_arg_ids)) #291
opp_sim <- select(opp_sim, ID, one_of(solo_arg_ids)) #311

# Identify the maxes
end_max_val <- apply(end_sim[,-1], 1, max)
end_max_arg <- colnames(end_sim[,-1])[max.col(end_sim[,-1])] 
opp_max_val <- apply(opp_sim[,-1], 1, max)
opp_max_arg <- colnames(opp_sim[,-1])[max.col(opp_sim[,-1])] 

# Consolidate 
end_max_df <- data.frame(ID = end_sim$ID,
                         Group = "Endorse",
                         max_sim = end_max_val,
                         item_id = end_max_arg)
opp_max_df <- data.frame(ID = opp_sim$ID,
                         Group = "Oppose",
                         max_sim = opp_max_val,
                         item_id = opp_max_arg)
all_max_df <- rbind(end_max_df, opp_max_df)
all_max_df$Group <- as.factor(all_max_df$Group)

# head(all_max_df)

# Merge with Value Labels
all_max_args <- inner_join(all_max_df, d.argl2_1, by = "item_id") # 602 rows is correct

# Summary by group
all_max_vals <- select(all_max_args, Group, one_of(value_names)) %>%
  group_by(Group) %>%
  summarise(across(everything(), ~ sum(., na.rm = TRUE)))
all_max_vals <- data.frame(all_max_vals)
all_max_vals 

## Group-Level Summaries ####

# Long data
all_max_mwide <-  select(all_max_args, ID, Group, max_sim, one_of(value_names))
all_max_mlong <- tidyr::pivot_longer(all_max_mwide, 
                                     cols = one_of(value_names),
                                     names_to = "Val_Name",
                                     values_to = "Correct")  
all_max_mlong <- filter(all_max_mlong, Correct == 1)
all_max_mlong$Val_Name <- factor(all_max_mlong$Val_Name)
head(all_max_mlong)

# Group averages
all_max_summ <- all_max_mlong %>% 
  group_by(Group, Val_Name) %>%
  summarise(msim = mean(max_sim))

# Group max tables
opp_tc <- filter(all_max_mlong, Group == "Oppose")
# Create an order to the factor based on abs frequency
opp_tc_tab <- table(opp_tc$Val_Name)
opp_tc_df <- as.data.frame(opp_tc_tab)
colnames(opp_tc_df) <- c("Val_Name", "Count")
opp_tc_df <- arrange(opp_tc_df, -Count)
opp_tc_df$Share <- opp_tc_df$Count/sum(opp_tc_df$Count)
opp_tc_df
# top 3
sum(opp_tc_df$Share[1:3])
# 0.6109325

end_tc <- filter(all_max_mlong, Group == "Endorse")
# Create an order to the factor based on abs frequency
end_tc_tab <- table(end_tc$Val_Name)
end_tc_df <- as.data.frame(end_tc_tab)
colnames(end_tc_df) <- c("Val_Name", "Count")
end_tc_df <- arrange(end_tc_df, -Count)
end_tc_df$Share <- end_tc_df$Count/sum(end_tc_df$Count)
end_tc_df
# top 3
sum(end_tc_df$Share[1:3])
# 0.5257732


###################################################

###################################################
# 7. Appendix F: Test of HS1 ####

## HS1: Opponents' and Endorsers' justifications will be semantic discriminable ####
# set up partition
nasis.orig <- as.numeric(
  ifelse(d.nasis.sims$Group == "Endorse", 1, 2))
#table(nasis.orig) #matches subsets

nperm = 1000 
set.seed(1234) 

# Make an empty dataframe for the results
nasis.perm.results <- data.frame(wb_eo = rep(0, nperm),
                                 wb_oe = rep(0, nperm))

for (i in 1:nperm){
  #create new code labels
  nasis_new <- sample(nasis.orig, 
                       size = length(nasis.orig), 
                        replace = F) 
  #remove all the meta-info, except index
  d.nasis_new <- select(d.nasis.sims, 
                          ID, starts_with("id_")) 
  #attach new code labels to the data
  d.nasis_new <- cbind(nasis_new, d.nasis_new)
  # Generate new subsets
  fav_new <- filter(d.nasis_new, nasis_new == 1) 
  opp_new <- filter(d.nasis_new, nasis_new == 2)   
  # Get names of docs to keep
  fav_docs <- as.character(fav_new$ID)
  opp_docs <- as.character(opp_new$ID)
  # Calculate sims
  fav_fav <- select(fav_new, ID, all_of(fav_docs)) %>%
    mutate(n_paras = ncol(.) - 2,
           #this removes the ref_text's own column and para_index
           total = rowSums(across(where(is.numeric))) - n_paras - 1,
           #this removes the value of 1 within the row where the document matches itself
           #and the value sitting in the new column n_paras, which is a count
           mean_doc_sim = total/n_paras) %>% 
    summarise(mean_sim = mean(mean_doc_sim))
  
  fav_opp <- select(fav_new, ID, all_of(opp_docs)) %>%
    mutate(n_paras = ncol(.) - 1,
           #this removes para_index only
           total = rowSums(across(where(is.numeric))) - n_paras,
           mean_doc_sim = total/n_paras) %>%
    summarise(mean_sim = mean(mean_doc_sim)) 
  
  opp_opp <- select(opp_new, ID, all_of(opp_docs)) %>%
    mutate(n_paras = ncol(.) - 2,
           total = rowSums(across(where(is.numeric))) - n_paras - 1,
           mean_doc_sim = total/n_paras) %>%
    summarise(mean_sim = mean(mean_doc_sim)) 
  
  # Calculate the 2 within-between differences in an order that corresponds to the output df
  out.row <- c((fav_fav - fav_opp),
               (opp_opp - fav_opp))
  nasis.perm.results[i,] <- out.row
}

nasis.perms <- nasis.perm.results
head(nasis.perms)

# False-positive probability (1 / perms is max sensitivity)  
nasis.fp <- c(sum(nasis.perms$wb_eo > wb_eo)/nperm,
                sum(nasis.perms$wb_oe > wb_oe)/nperm)
nasis.fp
# 0.076 0.000
# Partial confirmation of HS1:
# Oppose distinct from Endorse (0 values > wb_oe)
# Endorse is not as coherent (76 values > wb_eo) 

# Result: Oppose view is semantically distinguishable 
# from Endorse; but reverse is not true at false positive rate < 0.05



###################################################

###################################################
# 8. Generate Figure 4 ####

## Figure 4a: paired points ####

# Plot opponents difference in sem-sim
opp_ids <- filter(nasis_arg_sim, 
                  Group == "Oppose")$ID
opp_OR <- data.frame(Sim = opp_OR_sim_avg,
                     Value = c("Self-Direction (Action) \nand Universalism (Concern)"))
opp_Misc <- data.frame(Sim = opp_OtherVals_sim_avg,
                     Value = c("All Other \nValues"))
opp_sims <- rbind(opp_OR, opp_Misc)
opp_df <- data.frame(ID = rep(opp_ids, 2), 
                     opp_sims)
str(opp_df) 
opp_df_means <- data.frame(Value = c("Self-Direction (Action) \nand Universalism (Concern)",
                                     "All Other \nValues"),
                           Sim = c(mean(opp_OR_sim_avg),
                                   mean(opp_OtherVals_sim_avg)))
opp_df_means$Value <- factor(opp_df_means$Value, 
                             levels = c("Self-Direction (Action) \nand Universalism (Concern)",
                                        "All Other \nValues"))

figH4a <- ggplot(opp_df, aes(x = Value,
                            y = Sim),) +
  geom_point(color = "darkgrey", 
              alpha=0.8) +
  geom_line(aes(group = ID),
            color="lightgrey", alpha=0.5) +
  geom_point(data = opp_df_means, 
             aes(x = Value,
                 y = Sim),
             color = "black",
             shape = 18,
             size = 3) +
  labs(title="Opponent Justifications and \nValue Annotations (N=311)",
       subtitle = "Paired points represent a single justification; \nblack diamonds represent means",
       x = "",
       y = "Average Semantic Similarity") +
  theme_bw() +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_text(size = 16),
        strip.text = element_text(size = 14),
        axis.text = element_text(size = 14),
        title = element_text(size = 18))
figH4a

## Figure 4b: Stacked Bars ####
all_max_vlong <- tidyr::pivot_longer(all_max_vals,
                                     cols = SelfDirection_action:Universalism_object,
                                     names_to = "Value",
                                     values_to = "Count")
all_max_vlong
all_max_vlong$Share <- ifelse(all_max_vlong$Group == "Endorse",
                              all_max_vlong$Count/291,
                              all_max_vlong$Count/311)

empty_vals <- c("Conformity_inter", "Hedonism", "Stimulation")

# Drop value categories with no hits
all_max_vlong <- filter(all_max_vlong, ! Value %in% empty_vals)
# Consolidate all other values
all_max_vlong$Category <- ifelse(all_max_vlong$Value %in% c("SelfDirection_action",
                                                            "Universalism_concern",
                                                            "Humility"),
                                 all_max_vlong$Value, "Other")

# Aggregate into buckets
max_vlong_3cat <- all_max_vlong %>%
  select(-Count, -Value) %>%
  group_by(Group, Category) %>%
  summarise(Share_Cat = sum(Share))
max_vlong_3cat
max_vlong_3cat$Category <- factor(max_vlong_3cat$Category,
                                  levels = c("SelfDirection_action",
                                             "Universalism_concern",
                                             "Humility",
                                             "Other"),
                                  labels = c("Self-Direction \n(Action)",
                                             "Universalism \n(Concern)",
                                             "Humility",
                                             "All Other Values"))
max_vlong_3cat$Group <- factor(max_vlong_3cat$Group,
                               levels = c("Oppose", "Endorse"))
max_vlong_3cat$Text_Color <- c("white", "white", "black", "black",
                               "white", "white", "black", "black")
 
figH4b <- ggplot(max_vlong_3cat,
                 aes(x = Group,
                     y = Share_Cat*100,
                     fill = Category,
                     label = round(Share_Cat*100, 1))) +
  geom_col(color = "black") + 
  geom_text(aes(color = Text_Color),
            position=position_stack(vjust = 0.5),
            size = 4) +
  labs(title = "Most Similar Unique Value \nAnnotation",
       subtitle = "311 Opponents; 291 Endorsers",
       y = "Share of Total Justifications (%)") +
  scale_fill_manual(values = c("#f7f7f7", "#cccccc", "#969696", "#525252")) +
  scale_color_manual(values = c("black", "white")) +
  guides(color = "none",
         fill=guide_legend(nrow=2,byrow=TRUE)) + 
  theme_bw() +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_text(size = 16),
        axis.text = element_text(size = 14),
        title = element_text(size = 18),
        legend.position = "bottom",
        legend.title = element_blank(),
        legend.spacing.x = unit(3, "mm"))

figH4b


## Figure 4c: Max Similarities for Table ####

# Text cannot be shared for all items, but these 4 are presented in the paper
exp.fig4c <- c("id_335", "id_412", "id_235", "id_258")

sims.fig4c <- filter(all_max_args, ID %in% exp.fig4c)
sims.fig4c

###################################################

###################################################
# 9. OPTIONAL: Regenerate Embeddings for HVBA data ####

## Set up Python ####
# library(reticulate) 

# A Python installation (3 or higher) is required.  See reticulate's documentation for
# how to set up Python for the first time
# The pip installations commands for the primary modules (packages) are: 
#!pip install transformers==3.1
#!pip3 install seaborn
# Make sure that these modules and their dependencies are stored in a single virtual environment

# Identify the python environment to use.  I use Anaconda and a virtual env called "nlp"
# for this project.  Replace "nlp" with the name of your virtual env where the necessary
# modules are installed
use_condaenv("nlp")

# Import the necessary modules 
py_run_string("from sentence_transformers import SentenceTransformer")
py_run_string("import pandas as pd")
py_run_string("import numpy as np")

# Import the particular model from Sentence-Transformers
# https://huggingface.co/sentence-transformers

# Model to run:
# current bench: stsb-mpnet-base-v2
py_run_string("model = SentenceTransformer('stsb-mpnet-base-v2')")

## Generate HVBA Embeddings ####

### 1Dim ####
py_run_string("df = pd.read_csv('Args_Unique_df.csv')")
py_run_string("sentences = df['premise'].values")
py_run_string("sentence_embeddings = model.encode(sentences)")

# py_run_string("np.savetxt('Args_Unique_df_embeddings.csv', sentence_embeddings, delimiter=',')")

### 2Dim ####
py_run_string("df = pd.read_csv('Args_2Dims_df.csv')")
py_run_string("sentences = df['premise'].values")
py_run_string("sentence_embeddings = model.encode(sentences)")

# py_run_string("np.savetxt('Args_2Dims_df_embeddings.csv', sentence_embeddings, delimiter=',')")


###################################################

